{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 数据库预处理\n",
    "\n",
    "本笔记添加CID域名到化合物中,移除不需要的列."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import json\n",
    "import pandas as pd\n",
    "import re\n",
    "from io import StringIO\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_path = '/home/regen/projects/cpie_dbs' # 自行修改数据库所在路径"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_identifier(names, pattern):\n",
    "    if pd.notna(names):\n",
    "        match = re.search(pattern, names)\n",
    "        return match.group(0) if match else None\n",
    "        # db_values = [name for name in names.split('|') if name.startswith(identifier)]\n",
    "        # return db_values[0] if db_values else None\n",
    "    return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "url = \"https://pubchem.ncbi.nlm.nih.gov/sdq/sdqagent.cgi\"\n",
    "headers = {'Content-Type': 'application/json'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = {\n",
    "        \"download\": [\"cid\", \"cmpdsynonym\"],\n",
    "        \"collection\": \"compound\",\n",
    "        \"order\": [\"relevancescore,desc\"],\n",
    "        \"start\": 1,\n",
    "        \"limit\": 10000000,\n",
    "        \"downloadfilename\": \"compounds\",\n",
    "        \"where\": {\n",
    "            \"ands\": []\n",
    "        }\n",
    "    }"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# BindingDB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "bdb_path = os.path.join(data_path, 'BindingDB.tsv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_18131/2636726826.py:1: DtypeWarning: Columns (8,9,10,11,15) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  bdb = pd.read_csv(\n"
     ]
    }
   ],
   "source": [
    "bdb = pd.read_csv(\n",
    "    bdb_path,\n",
    "    sep='\\t',\n",
    "    usecols=[\n",
    "        'Ligand SMILES', 'Ligand InChI', 'BindingDB MonomerID',\n",
    "        'Ligand InChI Key', 'BindingDB Ligand Name',\n",
    "        'Target Name',\n",
    "        'Target Source Organism According to Curator or DataSource', 'Ki (nM)',\n",
    "        'IC50 (nM)', 'Kd (nM)', 'EC50 (nM)', 'pH', 'Temp (C)',\n",
    "        'Curation/DataSource',\n",
    "        'UniProt (SwissProt) Entry Name of Target Chain',\n",
    "        'UniProt (SwissProt) Primary ID of Target Chain'\n",
    "    ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "query['where']['ands'] = [{\"*\": \"bindingdb\"}]\n",
    "pattern= r'\\BDBM\\d+\\b'\n",
    "identifier_column = 'BindingDB MonomerID'\n",
    "\n",
    "# Define parameters in a dictionary/json format\n",
    "parameters = {\n",
    "    \"infmt\": \"json\",\n",
    "    \"outfmt\": \"csv\",\n",
    "    \"query\": json.dumps(query)\n",
    "}\n",
    "\n",
    "response = requests.post(url, params=parameters, headers=headers)\n",
    "response.text\n",
    "csv_file_like = StringIO(response.text)\n",
    "# Read CSV data into a pandas DataFrame\n",
    "data = pd.read_csv(csv_file_like)\n",
    "data = data.rename(columns={' cid': 'CID', 'cmpdsynonym': 'synonyms'})\n",
    "# Extract bdb ID\n",
    "data[identifier_column] = data['synonyms'].apply(lambda x: extract_identifier(x, pattern))\n",
    "data = data.drop(columns='synonyms')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.to_csv(os.path.join(data_path, 'BindingDB_from_pubchem.tsv'), index=False)# 用于本地读取pubchem信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# if bdb_from_pubchem:='BindingDB_from_pubchem.tsv' in os.listdir(data_path):\n",
    "#     data = pd.read_csv(os.path.join(data_path, bdb_from_pubchem))\n",
    "# 如果后续单元格处理失败,跳过上2步,将本单元格注视取消,直接从本地获取pubchem信息\n",
    "# 如果要获取最新的pubchem信息,则正常按流程执行笔记"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bdb = pd.merge(bdb, data, on=identifier_column, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove unnecessary columns\n",
    "bdb = bdb[['Ligand SMILES','Ligand InChI','BindingDB MonomerID','Ligand InChI Key','BindingDB Ligand Name',\n",
    "           'Target Name','Target Source Organism According to Curator or DataSource',\n",
    "           'Ki (nM)','IC50 (nM)','Kd (nM)','EC50 (nM)','pH','Temp (C)','Curation/DataSource',\n",
    "           'UniProt (SwissProt) Entry Name of Target Chain','UniProt (SwissProt) Primary ID of Target Chain', 'CID']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "bdb.to_csv(bdb_path, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ChEMBL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "chembl_path = os.path.join(data_path, 'ChEMBL.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_25449/293252638.py:1: DtypeWarning: Columns (45) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  chembl = pd.read_csv(chembl_path, usecols=['Molecule ChEMBL ID','Molecule Name','Smiles','Standard Type','Standard Value',\n"
     ]
    }
   ],
   "source": [
    "chembl = pd.read_csv(chembl_path, usecols=['Molecule ChEMBL ID','Molecule Name','Smiles','Standard Type','Standard Value',\n",
    "                 'Standard Units','pChEMBL Value','Data Validity Comment','Comment',\n",
    "                 'Target ChEMBL ID','Target Name','Target Organism','Target Type','Source ID','Action Type'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "query['where']['ands'] = [{\"*\": \"chembl\"}]\n",
    "pattern = r'\\bCHEMBL\\d+\\b'\n",
    "identifier_column = 'Molecule ChEMBL ID'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define parameters in a dictionary/json format\n",
    "parameters = {\n",
    "    \"infmt\": \"json\",\n",
    "    \"outfmt\": \"csv\",\n",
    "    \"query\": json.dumps(query)\n",
    "}\n",
    "\n",
    "response = requests.post(url, params=parameters, headers=headers)\n",
    "response.text\n",
    "csv_file_like = StringIO(response.text)\n",
    "# Read CSV data into a pandas DataFrame\n",
    "data = pd.read_csv(csv_file_like)\n",
    "data = data.rename(columns={' cid': 'CID', 'cmpdsynonym': 'synonyms'})\n",
    "# Extract chembl ID\n",
    "data[identifier_column] = data['synonyms'].apply(lambda x: extract_identifier(x, pattern))\n",
    "data = data.drop(columns='synonyms')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.to_csv(f'{data_path}/chembl_from_pubchem.csv',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "从本地读取pubchem CID信息\n"
     ]
    }
   ],
   "source": [
    "# if 'chembl_from_pubchem.csv' in os.listdir(data_path):\n",
    "#     data = pd.read_csv(os.path.join(data_path, 'chembl_from_pubchem.csv'))\n",
    "#     print('从本地读取pubchem CID信息')\n",
    "\n",
    "# # 如果后续单元格处理失败,跳过上2步,将本单元格注视取消,直接从本地获取pubchem信息\n",
    "# # 如果要获取最新的pubchem信息,则正常按流程执行笔记"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "chembl = pd.merge(chembl, data, on=identifier_column, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove unnecessary columns\n",
    "chembl = chembl[['Molecule ChEMBL ID','Molecule Name','Smiles','Standard Type','Standard Value',\n",
    "                 'Standard Units','pChEMBL Value','Data Validity Comment','Comment',\n",
    "                 'Target ChEMBL ID','Target Name','Target Organism','Target Type','Source ID','Action Type','CID']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "chembl.to_csv(chembl_path, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Drug Target Commons"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "dtc_path = os.path.join(data_path, 'DTC.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_39924/4021045723.py:1: DtypeWarning: Columns (0,1,8,14) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  dtc = pd.read_csv(dtc_path, usecols=['compound_id','standard_inchi_key','target_id','gene_names','wildtype_or_mutant','mutation_info','standard_type','standard_relation','standard_value','standard_units','activity_comment','pubmed_id','doc_type'])\n"
     ]
    }
   ],
   "source": [
    "dtc = pd.read_csv(dtc_path, usecols=['compound_id','standard_inchi_key','target_id','gene_names','wildtype_or_mutant','mutation_info','standard_type','standard_relation','standard_value','standard_units','activity_comment','pubmed_id','doc_type'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "DTC也使用ChEMBL id."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "identifier_columns = 'compound_id'\n",
    "data.rename(columns={'Molecule ChEMBL ID': identifier_columns},inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "dtc = pd.merge(dtc, data, on=identifier_columns, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove unnecessary columns\n",
    "dtc = dtc[['compound_id','standard_inchi_key','target_id','gene_names','wildtype_or_mutant',\n",
    "           'mutation_info','standard_type','standard_relation','standard_value','standard_units',\n",
    "           'activity_comment','pubmed_id','doc_type','CID']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dtc.to_csv(dtc_path, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Comparaive Toxicogenomics Database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ctd_path = os.path.join(data_path, 'CTD.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ctd = pd.read_csv(ctd_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query['where']['ands'] = [{\"*\": \"ctd\"}, {\"*\": \"comparative\"}, {\"*\": \"toxicogenomics\"}, {\"*\": \"database\"}]\n",
    "pattern= r'\\b(C|D)\\d+\\b'\n",
    "identifier_column = 'ChemicalID'\n",
    "\n",
    "# Define parameters in a dictionary/json format\n",
    "parameters = {\n",
    "    \"infmt\": \"json\",\n",
    "    \"outfmt\": \"csv\",\n",
    "    \"query\": json.dumps(query)\n",
    "}\n",
    "\n",
    "response = requests.post(url, params=parameters, headers=headers)\n",
    "response.text\n",
    "csv_file_like = StringIO(response.text)\n",
    "# Read CSV data into a pandas DataFrame\n",
    "data = pd.read_csv(csv_file_like)\n",
    "data = data.rename(columns={' cid': 'CID', 'cmpdsynonym': 'synonyms'})\n",
    "# Extract ctd ID\n",
    "data[identifier_column] = data['synonyms'].apply(lambda x: extract_identifier(x, pattern))\n",
    "data = data.drop(columns='synonyms')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ctd = pd.merge(ctd, data, on=identifier_column, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ctd.to_csv(ctd_path, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DrugBank"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "db_path = os.path.join(data_path, 'DB.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "db = pd.read_csv(db_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query['where']['ands'] = [{\"*\": \"drugbank\"}]\n",
    "pattern= r'\\DB\\d+\\b'\n",
    "identifier_column = 'drugbank-id'\n",
    "\n",
    "# Define parameters in a dictionary/json format\n",
    "parameters = {\n",
    "    \"infmt\": \"json\",\n",
    "    \"outfmt\": \"csv\",\n",
    "    \"query\": json.dumps(query)\n",
    "}\n",
    "\n",
    "response = requests.post(url, params=parameters, headers=headers)\n",
    "response.text\n",
    "csv_file_like = StringIO(response.text)\n",
    "# Read CSV data into a pandas DataFrame\n",
    "data = pd.read_csv(csv_file_like)\n",
    "data = data.rename(columns={' cid': 'CID', 'cmpdsynonym': 'synonyms'})\n",
    "# Extract db ID\n",
    "data[identifier_column] = data['synonyms'].apply(lambda x: extract_identifier(x, pattern))\n",
    "data = data.drop(columns='synonyms')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "db = pd.merge(db, data, on=identifier_column, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "db.to_csv(db, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DrugCentral"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dc_path = os.path.join(data_path, 'DC.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dc = pd.read_csv(dc_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# InChIKey is available in DC\n",
    "query_dc = {\n",
    "        \"download\": [\"cid\", \"inchikey\"],\n",
    "        \"collection\": \"compound\",\n",
    "        \"order\": [\"relevancescore,desc\"],\n",
    "        \"start\": 1,\n",
    "        \"limit\": 10000000,\n",
    "        \"downloadfilename\": \"compounds\",\n",
    "        \"where\": {\n",
    "            \"ands\": []\n",
    "        }\n",
    "    }\n",
    "query_dc['where']['ands'] = [{\"*\": \"drugcentral\"}]\n",
    "\n",
    "identifier_column = 'InChIKey'\n",
    "\n",
    "# Define parameters in a dictionary/json format\n",
    "parameters = {\n",
    "    \"infmt\": \"json\",\n",
    "    \"outfmt\": \"csv\",\n",
    "    \"query\": json.dumps(query_dc)\n",
    "}\n",
    "\n",
    "response = requests.post(url, params=parameters, headers=headers)\n",
    "response.text\n",
    "csv_file_like = StringIO(response.text)\n",
    "# Read CSV data into a pandas DataFrame\n",
    "data = pd.read_csv(csv_file_like)\n",
    "data = data.rename(columns={' cid': 'CID', 'inchikey': 'InChIKey'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dc = pd.merge(dc, data, on=identifier_column, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dc.to_csv(dc_path, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "cpi",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
